This file contains the code from lecture 3 (3_r_base_r_graphics_2023s.pptx). A separate .Rmd/.html has the plotting example code that accompanies 3_visualization_example_in_r_base_graphics_2023s.
R performs mathematical calculations. Often these calculations are in the form of functions.
# Simple arithmetic. Each "## " line is the printed result of the call above it.
1 + 2 # comments can be added after hashes
## [1] 3
# Function calls: a function name followed by its arguments in parentheses.
sqrt(4)
## [1] 2
mean(c(1,2,3,4))
## [1] 2.5
# rnorm() draws random numbers, so the printed values differ from run to run.
rnorm(n = 2, mean = 0, sd = 1)
## [1] 1.11453472 0.08567202
# The same call with the arguments matched by position instead of by name.
rnorm(2, 0, 1)
## [1] -1.357778 1.065296
Functions can be nested inside of other functions.
# Inner calls are evaluated first: sqrt(mean(c(2,4,6))) is sqrt(4) = 2 and
# 1 - 1 is 0, so this is the same as rnorm(n = 2, mean = 0, sd = 1).
rnorm(n = sqrt(mean(c(2,4,6))), mean = 1 - 1, sd = 1)
## [1] 0.04399856 0.41344068
To get documentation about a function use ?. To search documentation use ??.
?rnorm
??rnorm
It can save information into objects (which are like variables).
a <- 1
a
## [1] 1
a <- 1 + 2
a
## [1] 3
beta <- 2 + 2
beta
## [1] 4
a + beta
## [1] 7
These objects can be vectors of values.
# A numeric vector built with c(); typing the object's name prints it.
a <- c(1, 2, 3, 4)
a
## [1] 1 2 3 4
# A logical vector. TRUE and FALSE are spelled out in full: T and F are
# ordinary variables that can be reassigned, TRUE and FALSE are reserved words.
a <- c(TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE)
a
## [1] TRUE FALSE FALSE FALSE FALSE TRUE FALSE
You can choose subsets of values in a vector.
a <- c("a", "b", "c", "d")
a
## [1] "a" "b" "c" "d"
a[2]
## [1] "b"
a[2:3]
## [1] "b" "c"
You can operate on subsets of values in a vector.
a <- c(1, 2, 3, 4)
a
## [1] 1 2 3 4
a[2] <- a[2] + 1
a
## [1] 1 3 3 4
Using conditions and TRUE and FALSE can be handy.
# Comparing a vector with a value gives one TRUE/FALSE per element.
a <- c(1, 2, 3, 4)
a == 2
## [1] FALSE TRUE FALSE FALSE
# A logical vector used as an index keeps only the TRUE positions.
a[a == 2]
## [1] 2
# The same selection with the logical index written out by hand. TRUE/FALSE are
# spelled in full because T and F are ordinary variables that can be overwritten.
a[c(FALSE, TRUE, FALSE, FALSE)]
## [1] 2
R can work with tables of data called data frames.
# Read the small example table used by the data frame examples that follow.
# Your path to the file will be different!
data3_path <- "/Users/jonathanchernus/Documents/Teaching/2024s/HUGEN2073/lectures/lecture3_4/data3.csv"
if (file.exists(data3_path)) {
  a <- read.csv(data3_path)
} else {
  # Without the file, `a` would still hold the numeric vector from the previous
  # example and every data frame example below would fail. Rebuild the same
  # three-row table inline instead (values copied from the printout below).
  message("data3.csv not found at ", data3_path, "; using built-in example values.")
  a <- data.frame(
    name = c("Anna", "Xiao", "José"),
    x = c(1.1, 1.9, 3.0),
    y = c(0.50, 0.75, 0.80),
    stringsAsFactors = FALSE
  )
}
a
## name x y
## 1 Anna 1.1 0.50
## 2 Xiao 1.9 0.75
## 3 José 3.0 0.80
Since data frames are two dimensional, to subset you need two coordinates.
a
## name x y
## 1 Anna 1.1 0.50
## 2 Xiao 1.9 0.75
## 3 José 3.0 0.80
a[2, 3]
## [1] 0.75
a[3, 1]
## [1] "José"
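If you leave one of the two coordinates empty, you’ll get an entire row or column: leaving the column empty returns the whole row (as a one-row data frame), and leaving the row empty returns the whole column (as a vector):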
a
## name x y
## 1 Anna 1.1 0.50
## 2 Xiao 1.9 0.75
## 3 José 3.0 0.80
# Empty column index: every column of row 2, printed with its header and row label.
a[2, ]
## name x y
## 2 Xiao 1.9 0.75
# Empty row index: every value in column 1, printed as a plain vector.
a[, 1]
## [1] "Anna" "Xiao" "José"
You can use conditions and functions within the [s and ]s too.
a
## name x y
## 1 Anna 1.1 0.50
## 2 Xiao 1.9 0.75
## 3 José 3.0 0.80
# Row index: a[, 1] == "Xiao" is TRUE only for rows whose first column (name) is "Xiao".
# Column index: c("x") picks the column named x, so the result is that row's x value.
a[a[, 1] == "Xiao", c("x")]
## [1] 1.9
We often work with a whole column at a time, and there is a shortcut
for that using $.
a
## name x y
## 1 Anna 1.1 0.50
## 2 Xiao 1.9 0.75
## 3 José 3.0 0.80
a$name
## [1] "Anna" "Xiao" "José"
Note these three are exactly equal:
identical(a$name, a[, 1])
## [1] TRUE
identical(a$name, a[, c("name")])
## [1] TRUE
identical(a[, 1], a[, c("name")])
## [1] TRUE
R can also manipulate tables of data by adding columns that are transformations of other columns.
a$z <- a$x + a$y
a
## name x y z
## 1 Anna 1.1 0.50 1.60
## 2 Xiao 1.9 0.75 2.65
## 3 José 3.0 0.80 3.80
New columns can also be computed from columns you have already added, here the square root of z.
a$b <- sqrt(a$z)
a
## name x y z b
## 1 Anna 1.1 0.50 1.60 1.264911
## 2 Xiao 1.9 0.75 2.65 1.627882
## 3 José 3.0 0.80 3.80 1.949359
You can also add new data to a data frame.
# length(a$z) is the number of rows, so runif() draws one random value per row.
# The values in r are random and will differ every time this is run.
a$r <- runif(length(a$z))
a
## name x y z b r
## 1 Anna 1.1 0.50 1.60 1.264911 0.1134338
## 2 Xiao 1.9 0.75 2.65 1.627882 0.5640775
## 3 José 3.0 0.80 3.80 1.949359 0.9544028
# write.csv(a, "newdata.csv") # This would write out the data frame as a .csv file
Reading CSV data into R and writing it back out (commented out because it is redundant with the examples above).
# a <- read.csv("data.csv")
# a
# write.csv(a, "newdata.csv")
R can also create graphics.
plot(a$x, a$y)
a[, c("x", "y")]
## x y
## 1 1.1 0.50
## 2 1.9 0.75
## 3 3.0 0.80
plot(a$x, a$y, type = "o")
barplot(a$z, horiz = TRUE, names.arg=a$name)
To add some R embedded in the narrative text itself (inline): The mean of the 4 values 1, 2, 3, 4 is 2.5.
So plotting numbers and their squares:
x <- c(1,2,3,4,5,6,7,8,9,10)
y <- x^2
plot(x, y)
plot(x, y, main = "Squares",
xlab = "Integers",
ylab = "Squares",
xlim = c(-10, 110), ylim = c(-10, 100))
Adding some plotting parameters:
x <- c(1,2,3,4,5,6,7,8,9,10)
y <- x^2
plot(x, y)
plot(x, y, main = "Squares",
xlab = "Integers", ylab = "Squares",
xlim = c(-10, 110), ylim = c(-10, 100),
col = c(2,3,4), lty=1)
Let’s add a line between the points, a vertical line at 9 and a horizontal line at 81, and a second set of points:
plot(x, y, main = "Squares",
xlab = "Integers", ylab = "Squares",
xlim = c(-10, 110), ylim = c(-10, 100),
pch = c(15,16,17), col = c(2,3,4), cex = 3, lwd = 2)
lines(x, y, col = 1, lwd = 4, lty = "dotted")
abline(v = 9, lwd = 4, col = 2)
abline(h = 81, lwd = 4, col = 4)
y2 <- x * 9
points(x, y2, col = 1, lwd = 4, cex = 2)
Load tidyverse so we can use the alpha() function later (it is provided by ggplot2,
one of the core tidyverse packages), and read in the data.
# Load package
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the data in
# You'll need to download the data and change the path
data <- read.csv("/Users/jonathanchernus/Documents/Teaching/2024s/HUGEN2073/lectures/lecture3_4/20220120-rlm-2073_Data.csv")
Inspect the data:
dim(data)
## [1] 1000 4
head(data)
## id sex age height
## 1 1 F 42 168
## 2 2 M 38 184
## 3 3 M 36 179
## 4 4 F 54 161
## 5 5 M 26 189
## 6 6 F 61 169
tail(data)
## id sex age height
## 995 995 F 37 168
## 996 996 M 44 173
## 997 997 F 25 160
## 998 998 F 52 181
## 999 999 M 50 174
## 1000 1000 F 37 172
summary(data)
## id sex age height
## Min. : 1.0 Length:1000 Min. :25.00 Min. :143.0
## 1st Qu.: 250.8 Class :character 1st Qu.:34.00 1st Qu.:164.0
## Median : 500.5 Mode :character Median :45.00 Median :171.0
## Mean : 500.5 Mean :44.66 Mean :171.6
## 3rd Qu.: 750.2 3rd Qu.:54.00 3rd Qu.:179.0
## Max. :1000.0 Max. :64.00 Max. :204.0
Check out the categorical data and make histograms of the continuous data, experimenting with bin widths.
# Categorical variable
table(data$sex)
##
## F M
## 500 500
# Continuous variables
hist(data$age)
hist(data$height)
# Best bin width?
hist(data$height, breaks = 10)
hist(data$height, breaks = 100)
hist(data$height, breaks = 1000)
Making a scatterplot with plot:
# Scatterplots
plot(x = data$age, y = data$height)
Changing the alpha/opacity setting:
# Change point settings
# alpha is a transparency parameter (25% opaque)
plot(data$age, data$height,
pch = 16,
col = alpha("black", 0.25))
Stratify by sex, making separate plots for males and females. What is the “problem” here?
# Split the rows by sex once, then draw one scatterplot per group.
females <- data[data$sex == "F", ]
males <- data[data$sex == "M", ]
# Females: dark green triangles
plot(females$age, females$height,
     pch = 17, col = alpha("darkgreen", 0.25),
     xlab = "Age", ylab = "Height")
# Males: purple circles
plot(males$age, males$height,
     pch = 16, col = alpha("purple", 0.25),
     xlab = "Age", ylab = "Height")
Now try with the same y-axis:
# What is the full range of heights?
summary(data$height)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 143.0 164.0 171.0 171.6 179.0 204.0
# Plot again, this time with the same y-axis.
# The limits are computed from the data (143 and 204 here, the Min. and Max.
# printed above) instead of being typed in, so they stay right if the data change.
height_range <- range(data$height, na.rm = TRUE)
# Females first
plot(data$age[data$sex == "F"],
     data$height[data$sex == "F"],
     pch = 17, col = alpha("darkgreen", 0.25),
     xlab = "Age", ylab = "Height",
     ylim = height_range)
# Now males
plot(data$age[data$sex == "M"],
     data$height[data$sex == "M"],
     pch = 16, col = alpha("purple", 0.25),
     xlab = "Age", ylab = "Height",
     ylim = height_range)
Now try a single plotting window, first using plot and
then points. What’s wrong here?
# Split the rows by sex, then draw both groups in one plotting window.
females <- data[data$sex == "F", ]
males <- data[data$sex == "M", ]
# Plot females; this call sets up the axes
plot(females$age, females$height,
     pch = 17, col = alpha("darkgreen", 0.25),
     xlab = "Age", ylab = "Height")
# Add points for males on top of the existing plot
points(males$age, males$height,
       pch = 16, col = alpha("purple", 0.25))
Overlay again, but set ylim correctly in the call to
plot:
# View full range of heights
summary(data$height)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 143.0 164.0 171.0 171.6 179.0 204.0
# Compute the y-axis limits from all heights (143 and 204 here, the Min. and
# Max. printed above) so the axes set by plot() also cover the overlaid points.
height_range <- range(data$height, na.rm = TRUE)
# Plot females
plot(data$age[data$sex == "F"],
     data$height[data$sex == "F"],
     pch = 17, col = alpha("darkgreen", 0.25),
     xlab = "Age", ylab = "Height",
     ylim = height_range)
# Overlay males
points(data$age[data$sex == "M"],
       data$height[data$sex == "M"],
       pch = 16, col = alpha("purple", 0.25))
Now try to use only a single call to plot. We have to
make vectors of plotting parameters:
# Create per-row pch and col values from lookup tables keyed by sex:
# "F" gets symbol 17 in dark green, "M" gets symbol 16 in purple.
sex_key <- as.character(data$sex)
data$pch <- unname(c(F = 17, M = 16)[sex_key])
data$col <- unname(c(F = "darkgreen", M = "purple")[sex_key])
# Check that this worked right
head(data)
## id sex age height pch col
## 1 1 F 42 168 17 darkgreen
## 2 2 M 38 184 16 purple
## 3 3 M 36 179 16 purple
## 4 4 F 54 161 17 darkgreen
## 5 5 M 26 189 16 purple
## 6 6 F 61 169 17 darkgreen
table(data$sex, data$pch)
##
## 16 17
## F 0 500
## M 500 0
table(data$sex, data$col)
##
## darkgreen purple
## F 500 0
## M 0 500
Now use a single plot command:
# Use the vectors created above
plot(data$age, data$height,
pch = data$pch,
col = alpha(data$col, 0.25),
xlab="Age", ylab="Height")